Sys.Date()
## [1] "2020-12-18"
Authors: Hanna Klimczak, Kamil Pluciński
library(plotly)
library(knitr)
library(kableExtra)
library(caret)
To ensure that running the notebook will always return the same output, we set seed to 0.
set.seed(0)
data <- read.csv("Life_Expectancy_Data.csv")
kable(head(data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| Country | Year | Status | Life.expectancy | Adult.Mortality | infant.deaths | Alcohol | percentage.expenditure | Hepatitis.B | Measles | BMI | under.five.deaths | Polio | Total.expenditure | Diphtheria | HIV.AIDS | GDP | Population | thinness..1.19.years | thinness.5.9.years | Income.composition.of.resources | Schooling |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Afghanistan | 2015 | Developing | 65.0 | 263 | 62 | 0.01 | 71.279624 | 65 | 1154 | 19.1 | 83 | 6 | 8.16 | 65 | 0.1 | 584.25921 | 33736494 | 17.2 | 17.3 | 0.479 | 10.1 |
| Afghanistan | 2014 | Developing | 59.9 | 271 | 64 | 0.01 | 73.523582 | 62 | 492 | 18.6 | 86 | 58 | 8.18 | 62 | 0.1 | 612.69651 | 327582 | 17.5 | 17.5 | 0.476 | 10.0 |
| Afghanistan | 2013 | Developing | 59.9 | 268 | 66 | 0.01 | 73.219243 | 64 | 430 | 18.1 | 89 | 62 | 8.13 | 64 | 0.1 | 631.74498 | 31731688 | 17.7 | 17.7 | 0.470 | 9.9 |
| Afghanistan | 2012 | Developing | 59.5 | 272 | 69 | 0.01 | 78.184215 | 67 | 2787 | 17.6 | 93 | 67 | 8.52 | 67 | 0.1 | 669.95900 | 3696958 | 17.9 | 18.0 | 0.463 | 9.8 |
| Afghanistan | 2011 | Developing | 59.2 | 275 | 71 | 0.01 | 7.097109 | 68 | 3013 | 17.2 | 97 | 68 | 7.87 | 68 | 0.1 | 63.53723 | 2978599 | 18.2 | 18.2 | 0.454 | 9.5 |
| Afghanistan | 2010 | Developing | 58.8 | 279 | 74 | 0.01 | 79.679367 | 66 | 1989 | 16.7 | 102 | 66 | 9.20 | 66 | 0.1 | 553.32894 | 2883167 | 18.4 | 18.4 | 0.448 | 9.2 |
nrow(data)
## [1] 2938
kable(summary(data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| Country | Year | Status | Life.expectancy | Adult.Mortality | infant.deaths | Alcohol | percentage.expenditure | Hepatitis.B | Measles | BMI | under.five.deaths | Polio | Total.expenditure | Diphtheria | HIV.AIDS | GDP | Population | thinness..1.19.years | thinness.5.9.years | Income.composition.of.resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Length:2938 | Min. :2000 | Length:2938 | Min. :36.30 | Min. : 1.0 | Min. : 0.0 | Min. : 0.0100 | Min. : 0.000 | Min. : 1.00 | Min. : 0.0 | Min. : 1.00 | Min. : 0.00 | Min. : 3.00 | Min. : 0.370 | Min. : 2.00 | Min. : 0.100 | Min. : 1.68 | Min. :3.400e+01 | Min. : 0.10 | Min. : 0.10 | Min. :0.0000 | Min. : 0.00 | |
| Class :character | 1st Qu.:2004 | Class :character | 1st Qu.:63.10 | 1st Qu.: 74.0 | 1st Qu.: 0.0 | 1st Qu.: 0.8775 | 1st Qu.: 4.685 | 1st Qu.:77.00 | 1st Qu.: 0.0 | 1st Qu.:19.30 | 1st Qu.: 0.00 | 1st Qu.:78.00 | 1st Qu.: 4.260 | 1st Qu.:78.00 | 1st Qu.: 0.100 | 1st Qu.: 463.94 | 1st Qu.:1.958e+05 | 1st Qu.: 1.60 | 1st Qu.: 1.50 | 1st Qu.:0.4930 | 1st Qu.:10.10 | |
| Mode :character | Median :2008 | Mode :character | Median :72.10 | Median :144.0 | Median : 3.0 | Median : 3.7550 | Median : 64.913 | Median :92.00 | Median : 17.0 | Median :43.50 | Median : 4.00 | Median :93.00 | Median : 5.755 | Median :93.00 | Median : 0.100 | Median : 1766.95 | Median :1.387e+06 | Median : 3.30 | Median : 3.30 | Median :0.6770 | Median :12.30 | |
| NA | Mean :2008 | NA | Mean :69.22 | Mean :164.8 | Mean : 30.3 | Mean : 4.6029 | Mean : 738.251 | Mean :80.94 | Mean : 2419.6 | Mean :38.32 | Mean : 42.04 | Mean :82.55 | Mean : 5.938 | Mean :82.32 | Mean : 1.742 | Mean : 7483.16 | Mean :1.275e+07 | Mean : 4.84 | Mean : 4.87 | Mean :0.6276 | Mean :11.99 | |
| NA | 3rd Qu.:2012 | NA | 3rd Qu.:75.70 | 3rd Qu.:228.0 | 3rd Qu.: 22.0 | 3rd Qu.: 7.7025 | 3rd Qu.: 441.534 | 3rd Qu.:97.00 | 3rd Qu.: 360.2 | 3rd Qu.:56.20 | 3rd Qu.: 28.00 | 3rd Qu.:97.00 | 3rd Qu.: 7.492 | 3rd Qu.:97.00 | 3rd Qu.: 0.800 | 3rd Qu.: 5910.81 | 3rd Qu.:7.420e+06 | 3rd Qu.: 7.20 | 3rd Qu.: 7.20 | 3rd Qu.:0.7790 | 3rd Qu.:14.30 | |
| NA | Max. :2015 | NA | Max. :89.00 | Max. :723.0 | Max. :1800.0 | Max. :17.8700 | Max. :19479.912 | Max. :99.00 | Max. :212183.0 | Max. :87.30 | Max. :2500.00 | Max. :99.00 | Max. :17.600 | Max. :99.00 | Max. :50.600 | Max. :119172.74 | Max. :1.294e+09 | Max. :27.70 | Max. :28.60 | Max. :0.9480 | Max. :20.70 | |
| NA | NA | NA | NA’s :10 | NA’s :10 | NA | NA’s :194 | NA | NA’s :553 | NA | NA’s :34 | NA | NA’s :19 | NA’s :226 | NA’s :19 | NA | NA’s :448 | NA’s :652 | NA’s :34 | NA’s :34 | NA’s :167 | NA’s :163 |
As we can see from the summary above, there are some missing values in the dataset. Due to the fact that life.expectancy is the most important attribute for our analysis, we have decided to remove all rows where life.expectancy is NA.
data_new <- data[complete.cases(data[ , 'Life.expectancy']),]
After this operation, we still encounter missing values in the following columns: “Alcohol”, “Hepatitis.B”, “BMI”, “Polio”, “Total.expenditure”, “Diphtheria”, “GDP”, “Population”, “thinness..1.19.years”, “thinness.5.9.years”, “Income.composition.of.resources”, “Schooling”. We will fill these NA values with median for each column.
na_columns <- list("Alcohol", "Hepatitis.B", "BMI", "Polio", "Total.expenditure", "Diphtheria", "GDP", "Population", "thinness..1.19.years", "thinness.5.9.years", "Income.composition.of.resources", "Schooling")
for (col in na_columns){
m <- median(data_new[ , col], na.rm=TRUE)
print(col)
print(m)
data_new[ , col][is.na(data_new[ , col])] <- m
}
## [1] "Alcohol"
## [1] 3.77
## [1] "Hepatitis.B"
## [1] 92
## [1] "BMI"
## [1] 43.35
## [1] "Polio"
## [1] 93
## [1] "Total.expenditure"
## [1] 5.75
## [1] "Diphtheria"
## [1] 93
## [1] "GDP"
## [1] 1764.974
## [1] "Population"
## [1] 1391757
## [1] "thinness..1.19.years"
## [1] 3.3
## [1] "thinness.5.9.years"
## [1] 3.4
## [1] "Income.composition.of.resources"
## [1] 0.677
## [1] "Schooling"
## [1] 12.3
kable(summary(data_new), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| Country | Year | Status | Life.expectancy | Adult.Mortality | infant.deaths | Alcohol | percentage.expenditure | Hepatitis.B | Measles | BMI | under.five.deaths | Polio | Total.expenditure | Diphtheria | HIV.AIDS | GDP | Population | thinness..1.19.years | thinness.5.9.years | Income.composition.of.resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Length:2928 | Min. :2000 | Length:2928 | Min. :36.30 | Min. : 1.0 | Min. : 0.00 | Min. : 0.010 | Min. : 0.000 | Min. : 1.00 | Min. : 0.0 | Min. : 1.00 | Min. : 0.00 | Min. : 3.00 | Min. : 0.370 | Min. : 2.00 | Min. : 0.100 | Min. : 1.68 | Min. :3.400e+01 | Min. : 0.100 | Min. : 0.100 | Min. :0.0000 | Min. : 0.00 | |
| Class :character | 1st Qu.:2004 | Class :character | 1st Qu.:63.10 | 1st Qu.: 74.0 | 1st Qu.: 0.00 | 1st Qu.: 1.107 | 1st Qu.: 4.854 | 1st Qu.:82.00 | 1st Qu.: 0.0 | 1st Qu.:19.40 | 1st Qu.: 0.00 | 1st Qu.:78.00 | 1st Qu.: 4.370 | 1st Qu.:78.00 | 1st Qu.: 0.100 | 1st Qu.: 578.80 | 1st Qu.:4.181e+05 | 1st Qu.: 1.600 | 1st Qu.: 1.600 | 1st Qu.:0.5040 | 1st Qu.:10.30 | |
| Mode :character | Median :2008 | Mode :character | Median :72.10 | Median :144.0 | Median : 3.00 | Median : 3.770 | Median : 65.611 | Median :92.00 | Median : 17.0 | Median :43.35 | Median : 4.00 | Median :93.00 | Median : 5.750 | Median :93.00 | Median : 0.100 | Median : 1764.97 | Median :1.392e+06 | Median : 3.300 | Median : 3.400 | Median :0.6770 | Median :12.30 | |
| NA | Mean :2008 | NA | Mean :69.22 | Mean :164.8 | Mean : 30.41 | Mean : 4.559 | Mean : 740.321 | Mean :83.05 | Mean : 2427.9 | Mean :38.29 | Mean : 42.18 | Mean :82.62 | Mean : 5.916 | Mean :82.39 | Mean : 1.748 | Mean : 6627.39 | Mean :1.026e+07 | Mean : 4.834 | Mean : 4.865 | Mean :0.6301 | Mean :12.02 | |
| NA | 3rd Qu.:2011 | NA | 3rd Qu.:75.70 | 3rd Qu.:228.0 | 3rd Qu.: 22.00 | 3rd Qu.: 7.400 | 3rd Qu.: 442.614 | 3rd Qu.:96.00 | 3rd Qu.: 362.2 | 3rd Qu.:56.10 | 3rd Qu.: 28.00 | 3rd Qu.:97.00 | 3rd Qu.: 7.330 | 3rd Qu.:97.00 | 3rd Qu.: 0.800 | 3rd Qu.: 4793.63 | 3rd Qu.:4.593e+06 | 3rd Qu.: 7.100 | 3rd Qu.: 7.200 | 3rd Qu.:0.7730 | 3rd Qu.:14.10 | |
| NA | Max. :2015 | NA | Max. :89.00 | Max. :723.0 | Max. :1800.00 | Max. :17.870 | Max. :19479.912 | Max. :99.00 | Max. :212183.0 | Max. :77.60 | Max. :2500.00 | Max. :99.00 | Max. :17.600 | Max. :99.00 | Max. :50.600 | Max. :119172.74 | Max. :1.294e+09 | Max. :27.700 | Max. :28.600 | Max. :0.9480 | Max. :20.70 |
As we can see, we have successfully dealt with missing values.
names(data_new)
## [1] "Country" "Year"
## [3] "Status" "Life.expectancy"
## [5] "Adult.Mortality" "infant.deaths"
## [7] "Alcohol" "percentage.expenditure"
## [9] "Hepatitis.B" "Measles"
## [11] "BMI" "under.five.deaths"
## [13] "Polio" "Total.expenditure"
## [15] "Diphtheria" "HIV.AIDS"
## [17] "GDP" "Population"
## [19] "thinness..1.19.years" "thinness.5.9.years"
## [21] "Income.composition.of.resources" "Schooling"
quantitive_cols = c("Year", "Life.expectancy", "Adult.Mortality", "infant.deaths", "Alcohol", "percentage.expenditure", "Hepatitis.B", "Measles", "BMI", "under.five.deaths", "Polio", "Total.expenditure", "Diphtheria","HIV.AIDS","GDP","Population", "thinness..1.19.years","thinness.5.9.years", "Income.composition.of.resources", "Schooling")
plots <- lapply(quantitive_cols, function(var) {
plot_ly(y = data_new[, var], type = "box", name=var, quartilemethod="exclusive")
})
fig <- subplot(plots, nrows=4, margin=0.05)
fig <- fig %>% layout(legend = list(orientation = 'h'))
fig
plots <- lapply(quantitive_cols, function(var) {
plot_ly(x = data_new[, var], name=var)%>%
add_histogram()
})
fig <- subplot(plots, nrows=4, margin=0.05)
fig <- fig %>% layout(legend = list(orientation = 'h'))
fig
p1 <- plot_ly(data_new, x = ~Status) %>%
add_histogram()
p1
correlation_data <- data_new[quantitive_cols]
fig <- plot_ly(
x = quantitive_cols,
y = quantitive_cols,
z = cor(correlation_data), type = "heatmap"
)
fig
The above chart gives us important information about the correlation between variables. As we can see, there is almost perfect correlation between thinness.1.19.years and thinness.5.9.years. There also appears to be a very strong correlation between GDP and percentage.expenditure (0.9), as well as infant.deaths and under.five.deaths (0.99). Naturally, we can also see a strong negative correlation between Adult.Mortality and Life.expectancy (-0.69). Schooling and life expectancy seem to be slightly correlated as well (0.71).
For our prediction, we will need to drop some of the features that are highly correlated. We will drop thinness.5.9.years, percentage.expenditure and infant.deaths, as their correlation to our decision variable is weaker than features correlated to them.
The following plot is fully interactive - hover over selected line to see a specific value, select one country by double clicking on it in the legend. In one country view, you can then single click on one country to add it to the plot. If you want to return to the view of all countries - double click on the legend again.
countries <- unique(data_new$Country)
fig <- plot_ly(data, x = data_new[data_new$Country == 'Afghanistan', 'Year'])
for(name in countries){
fig <- fig %>% add_trace(y = data_new[data_new$Country == name, 'Life.expectancy'], name = name, type = 'scatter', mode = 'lines')
}
fig <- fig %>% layout(legend = list(orientation = 'h'))
fig
regression_data <- data_new[, !names(data_new) %in% c("Country", "thinness.5.9.years", "percentage.expenditure", "infant.deaths", "Year")]
kable(head(regression_data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| Status | Life.expectancy | Adult.Mortality | Alcohol | Hepatitis.B | Measles | BMI | under.five.deaths | Polio | Total.expenditure | Diphtheria | HIV.AIDS | GDP | Population | thinness..1.19.years | Income.composition.of.resources | Schooling |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Developing | 65.0 | 263 | 0.01 | 65 | 1154 | 19.1 | 83 | 6 | 8.16 | 65 | 0.1 | 584.25921 | 33736494 | 17.2 | 0.479 | 10.1 |
| Developing | 59.9 | 271 | 0.01 | 62 | 492 | 18.6 | 86 | 58 | 8.18 | 62 | 0.1 | 612.69651 | 327582 | 17.5 | 0.476 | 10.0 |
| Developing | 59.9 | 268 | 0.01 | 64 | 430 | 18.1 | 89 | 62 | 8.13 | 64 | 0.1 | 631.74498 | 31731688 | 17.7 | 0.470 | 9.9 |
| Developing | 59.5 | 272 | 0.01 | 67 | 2787 | 17.6 | 93 | 67 | 8.52 | 67 | 0.1 | 669.95900 | 3696958 | 17.9 | 0.463 | 9.8 |
| Developing | 59.2 | 275 | 0.01 | 68 | 3013 | 17.2 | 97 | 68 | 7.87 | 68 | 0.1 | 63.53723 | 2978599 | 18.2 | 0.454 | 9.5 |
| Developing | 58.8 | 279 | 0.01 | 66 | 1989 | 16.7 | 102 | 66 | 9.20 | 66 | 0.1 | 553.32894 | 2883167 | 18.4 | 0.448 | 9.2 |
Train test valid split
trainval_partition <-
createDataPartition(
y = regression_data$Life.expectancy,
p = .8,
list = FALSE)
trainval_data <- regression_data[ trainval_partition,]
test_data <- regression_data[-trainval_partition,]
train_partition <-
createDataPartition(
y = trainval_data$Life.expectancy,
p = .8,
list = FALSE)
train_data <- trainval_data[ train_partition,]
val_data <- trainval_data[-train_partition,]
control <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 5)
linear <- train(Life.expectancy ~ .,
data = train_data,
trControl = control,
preProcess = c('scale', 'center'),
method = "lm")
linear
## Linear Regression
##
## 1878 samples
## 16 predictor
##
## Pre-processing: scaled (16), centered (16)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 1690, 1690, 1690, 1690, 1690, 1690, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.232504 0.8022808 3.159272
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
lasso <- train(Life.expectancy ~ .,
data = train_data,
trControl = control,
preProcess = c('scale', 'center'),
method = "lasso")
lasso
## The lasso
##
## 1878 samples
## 16 predictor
##
## Pre-processing: scaled (16), centered (16)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 1690, 1690, 1689, 1690, 1688, 1691, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.1 8.534207 0.6981532 6.941555
## 0.5 5.498991 0.7556058 4.146438
## 0.9 4.251140 0.8009618 3.155516
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.
ridge <- train(Life.expectancy ~ .,
data = train_data,
trControl = control,
preProcess = c('scale', 'center'),
method = "ridge")
ridge
## Ridge Regression
##
## 1878 samples
## 16 predictor
##
## Pre-processing: scaled (16), centered (16)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 1689, 1691, 1691, 1689, 1689, 1690, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0e+00 4.228416 0.8031364 3.159813
## 1e-04 4.228416 0.8031369 3.159841
## 1e-01 4.275729 0.8030123 3.230414
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 1e-04.
ggplot(varImp(ridge))